## R Script Output -------------------------------------------------------------
# Appendix Figure A5: State-Level Correlations Between Chinese International Undergraduate Students and FREI


## Instructions ----------------------------------------------------------------
# Step 1: Adjust MAIN_DIR to where README.txt is located
# Step 2: Run entire script


## setup -----------------------------------------------------------------------
# clean slate
rm(list = ls())
date()

# load packages
pkg <- c("tidyverse",
         "RColorBrewer", 
         "gridExtra", 
         "viridis",
         "choroplethrZip")

lapply(pkg, require, character.only = TRUE)

# set main directory
MAIN_DIR <- "~/Dropbox/Research/ISQ-frei-replication/"


## load data -------------------------------------------------------------------
load(file = paste(MAIN_DIR, "foia-zcta.RData", sep = ""))

# load NAR data
nar.dest <- read_csv(file = paste(MAIN_DIR, "cn-frei-dest.csv", sep = ""))

# add state abbreviation
nar.dest <- nar.dest %>%
  mutate(state_abb = state.abb[match(nar.dest$state, state.name)])

# get zcta characteristics
data(zip.regions)

zip.regions <- zip.regions %>%
  group_by(region) %>%
  slice(1) %>%
  ungroup()


## Figure A5 -------------------------------------------------------------------
# create variable and subset
cn.ug <- foia.zcta %>%
  rename(Secondary = secondary,
         Undergraduate = undergraduate,
         Graduate = graduate) %>%
  group_by(cty, year) %>%
  mutate(Total = Secondary + Undergraduate + Graduate) %>%
  gather(student_level, n, -zcta, -year, -cty, -iso3n) %>%
  ungroup() %>%
  filter(cty == "CHINA" & student_level == "Undergraduate") 

# merge with state
cn.ug <- left_join(cn.ug, zip.regions,
                   by = c("zcta" = "region"))

# drop puerto rico and northern mariana islands
cn.ug <- cn.ug %>% 
  filter(!is.na(state.name))

# aggregate up
cn.ug.state <- cn.ug %>%
  group_by(year) %>%
  mutate(n_year = sum(n, na.rm = TRUE)) %>%
  ungroup() %>%
  group_by(state.name, year) %>%
  summarize(n_state_year = sum(n, na.rm = TRUE),
            n_year = first(n_year)) %>%
  ungroup() %>%
  mutate(ug_percent = n_state_year / n_year * 100,
         state = str_to_title(state.name))

# create full state-year placeholder
cn.ug.state.temp <- cn.ug.state %>% 
  expand(state, year = 2000:2020)

# merge
cn.ug.state <- right_join(cn.ug.state, cn.ug.state.temp,
                          by = c("state", "year")) %>%
  arrange(state, year)

# add 2012 student share variable
cn.ug.state <- cn.ug.state %>%
  group_by(state) %>%
  mutate(ug_percent_12 = ug_percent[year == 2012]) %>%
  ungroup()

# add state abb
cn.ug.state$state_abb <- state.abb[match(cn.ug.state$state, state.name)]

# check NAs for abbreviations
cn.ug.state %>% 
  filter(is.na(state_abb))

# fill in DC abbreviation
cn.ug.state <- cn.ug.state %>%
  mutate(state_abb = if_else(state == "District Of Columbia", "DC", state_abb))

# merge in NAR data
cn.ug.nar <- full_join(cn.ug.state, nar.dest,
                       c("state_abb", "year", "state")) %>%
  rename(frei_percent = percent) 

cn.ug.nar <- cn.ug.nar %>%
  filter(year >= 2013) %>%
  mutate(state_year = paste(state_abb, year, sep = "-"))
    
# plot
base.size <- 16

cn.scatter <- ggplot(cn.ug.nar, 
                     aes(x = ug_percent_12, 
                         y = frei_percent)) +
  geom_point(alpha = 0.6) + 
  geom_smooth(method = lm, color = "#E41A1C") +
  scale_x_continuous(limits = c(0, 20)) + 
  #scale_y_continuous(limits = c(0, 60)) + 
  scale_y_continuous(limits = c(-10, 70),
                     breaks = c(0, 20, 40, 60),
                     labels = c(0, 20, 40, 60)) + 
  facet_grid(~ year) +
  xlab("State Shares of Chinese Int'l Undergraduate Students in 2012 (%)") + 
  ylab("State Shares of Chinese FREI (%)") + 
  theme_bw() + theme(plot.title = element_text(lineheight = .8, face = "bold", 
                                               margin = margin(0, 0, 20, 0)),
                     axis.title.x = element_text(size = base.size,
                                                 margin = margin(20, 0, 0, 0)),
                     axis.title.y = element_text(size = base.size,
                                                 margin = margin(0, 20, 0, 0)),
                     axis.text.x = element_text(size = base.size - 3),
                     axis.text.y = element_text(size = base.size - 3),
                     strip.text = element_text(size = base.size + 5),
                     strip.background = element_blank(),
                     panel.border = element_rect(colour = "black"),
                     legend.key.size = unit(0.3, "inch"),
                     legend.key = element_rect(colour = "white"),
                     legend.text = element_text(size = base.size - 3),
                     legend.title = element_blank(),
                     panel.grid.minor = element_blank())

# add annotation
ann.text.ca <- tibble(ug_percent_12 = rep(cn.ug.nar %>% 
                                            filter(state_abb == "CA") %>%
                                            pull(ug_percent_12) %>%
                                            unique(), 
                                          n_distinct(cn.ug.nar$year)),  
                      frei_percent = cn.ug.nar %>% 
                        filter(state_abb == "CA") %>% 
                        pull(frei_percent) + 3,
                      lab = rep("CA", n_distinct(cn.ug.nar$year)),
                      year = sort(unique(cn.ug.nar$year)))

cn.scatter <- cn.scatter + geom_text(data = ann.text.ca,
                                     label = ann.text.ca$lab)

ann.text.ny <- tibble(ug_percent_12 = rep(cn.ug.nar %>% 
                                            filter(state_abb == "NY") %>%
                                            pull(ug_percent_12) %>%
                                            unique() + 2, 
                                          n_distinct(cn.ug.nar$year)),  
                      frei_percent = cn.ug.nar %>% 
                        filter(state_abb == "NY") %>% 
                        pull(frei_percent) + 2,
                      lab = rep("NY", n_distinct(cn.ug.nar$year)),
                      year = sort(unique(cn.ug.nar$year)))
ann.text.ny

cn.scatter <- cn.scatter + geom_text(data = ann.text.ny,
                                     label = ann.text.ny$lab)

# get slope
slope.df <- map_df(sort(unique(cn.ug.nar$year)), function(x) {
  sub.df <- cn.ug.nar %>% filter(year == x)
  
  coef <- round(coefficients(lm(frei_percent ~ ug_percent_12, sub.df)), 2)
  
  return(coef)
})

ann.text.slope <- tibble(ug_percent_12 = rep(17.5, 8),
                         frei_percent = rep(2, 8),
                         lab = paste("beta == ", slope.df$ug_percent_12, sep = ""),
                         year = seq(2013, 2020, by = 1))

cn.scatter <- cn.scatter + geom_text(data = ann.text.slope,
                                     label = ann.text.slope$lab,
                                     parse = TRUE)

cn.scatter

# print
pdf(file = paste(MAIN_DIR, "Figure-A5.pdf", sep = "/"),
    width = 16, height = 4)
print(cn.scatter)
dev.off()

